##############################
# Mutual information 
# Europe, over time
#
# Cederman, Girardin, Müller-Crepon, 2023
##############################


# GLOBALS
# Directory the figure is written to. Defaults to the working directory:
# an empty string would make file.path() build a path that starts at the
# filesystem root ("/stateethnic_mutinfo_ts.png").
fig.path <- "."

# INIT

# Packages

library(infotheo)
library(data.table)
library(ggplot2)

# Functions

## Normalized mutual information
## Divides the mutual information of X and Y by the geometric mean of
## their individual entropies (all three computed by infotheo).
## X, Y: vectors of equal length holding discrete labels.
## Returns a single number; it is not finite when either entropy is zero,
## because the denominator is then zero.
mi_norm <- function(X, Y){
  shared <- infotheo::mutinformation(X, Y)
  h.x <- infotheo::entropy(X)
  h.y <- infotheo::entropy(Y)
  shared / sqrt(h.x * h.y)
}



# DATA #####

# Load data
# The result must be kept: the conversion step below reads the object
# named `data`, which would otherwise resolve to the base function data().
data <- read.csv("mutinfo_data.csv")

# MEASURE MUTUAL INFORMATION 

# Convert to a data.table and discard rows containing any missing value
agg.tb <- na.omit(data.table(data))

# Keep only cells that are observed as often as the best-covered cell
## Observations per cell (the larger of the two non-missing counts)
N.tb <- agg.tb[
  ,
  .(obs = max(sum(!is.na(EthnicId)), sum(!is.na(StateId)))),
  by = "CellId"
]
## Highest observation count across all cells
max.tb <- N.tb[, .(maxobs = max(obs))]
## Attach the count to every row, then filter on it
agg.tb <- N.tb[agg.tb, on = "CellId"][obs == max.tb$maxobs]

# Attach each cell's 1886 state and ethnic identifiers to all of its rows

## Rows from 1886, with the identifiers copied under "Past*" names
past.tb <- agg.tb[Year == 1886]
past.tb[, `:=`(PastEthnicId = EthnicId, PastStateId = StateId)]

## Join the 1886 identifiers onto the full table by cell
past.ids <- past.tb[, .(CellId, PastStateId, PastEthnicId)]
agg.tb <- past.ids[agg.tb, on = "CellId"]





# 2 x 2 Plot: What is changing? Ethnic geography, state borders, or both? #####


# One normalized mutual information value per year for each combination
# of fixed (1886) versus time-varying state and ethnic identifiers.
# The label columns are created together with the value so that each
# table comes out as: Year, value, measure, ethnic.info, state.info.

## First Quadrant: both fixed at 1886
q1.tb <- agg.tb[
  ,
  .(value = mi_norm(PastStateId, PastEthnicId),
    measure = "Mutual Information",
    ethnic.info = "1886 ethnicity",
    state.info = "1886 state borders"),
  by = "Year"
]

## Second Quadrant: ethnicity varies, state borders fixed
q2.tb <- agg.tb[
  ,
  .(value = mi_norm(PastStateId, EthnicId),
    measure = "Mutual Information",
    ethnic.info = "Time-varying ethnicity",
    state.info = "1886 state borders"),
  by = "Year"
]

## Third Quadrant: ethnicity fixed, state borders vary
q3.tb <- agg.tb[
  ,
  .(value = mi_norm(StateId, PastEthnicId),
    measure = "Mutual Information",
    ethnic.info = "1886 ethnicity",
    state.info = "Time-varying state borders"),
  by = "Year"
]

## Fourth Quadrant: both vary
q4.tb <- agg.tb[
  ,
  .(value = mi_norm(StateId, EthnicId),
    measure = "Mutual Information",
    ethnic.info = "Time-varying ethnicity",
    state.info = "Time-varying state borders"),
  by = "Year"
]


# Plot

## Make plot data by stacking q1..q4
plot.df <- rbind(q1.tb, q2.tb, q3.tb, q4.tb)

## Make factor labels
## Levels follow the order of first appearance in the stacked data, which
## fixes the facet order. TRUE is spelled out because T is an ordinary
## variable that can be reassigned.
plot.df$ethnic.info <- factor(plot.df$ethnic.info,
                              levels = unique(plot.df$ethnic.info),
                              ordered = TRUE)
plot.df$state.info <- factor(plot.df$state.info,
                             levels = unique(plot.df$state.info),
                             ordered = TRUE)

## Build the figure: one line per panel, state-border variant in rows and
## ethnicity variant in columns
g <- ggplot(plot.df, aes(x = Year, y = value)) +
  geom_line() +
  facet_grid(state.info ~ ethnic.info, switch = "y") +
  # Axis on the right; the strip labels are switched to the left
  scale_y_continuous(position = "right", limits = c(.6, 1)) +
  labs(x = "Year", y = "Mutual information") +
  theme_minimal() +
  theme(legend.position = "top",
        panel.spacing = unit(2, "lines"))

## Save plot
## 6 x 6 inch PNG at 400 dpi, written into fig.path
png(file.path(fig.path, "stateethnic_mutinfo_ts.png"),
    width = 6, height = 6, res = 400, units = "in")
## Close the device even if drawing fails, so no open device is left
## behind. invisible() keeps the returned plot from being auto-printed
## after the device has been closed.
invisible(tryCatch(print(g), finally = dev.off()))


